In this notebook we will use the [Mushroom Classification](https://www.kaggle.com/uciml/mushroom-classification) dataset. The task is to predict whether a mushroom is poisonous from a set of features.
We will try out and compare different methods for extracting features and measure their predictive power.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import time
import os
df = pd.read_csv('mushrooms.csv')
pd.options.display.max_columns = None
df.head()
percent_missing = df.isnull().sum() * 100 / len(df)
missing_values = pd.DataFrame({'percent_missing': percent_missing})
missing_values.sort_values(by ='percent_missing' , ascending=False)
sns.set(style="ticks")
f = sns.countplot(x="class", data=df, palette="bwr")
plt.show()
df['class'].value_counts()
X = df.drop(['class'], axis = 1)
Y = df['class']
X = pd.get_dummies(X, prefix_sep='_')
X.head()
len(X.columns)
Y = LabelEncoder().fit_transform(Y)
#np.set_printoptions(threshold=np.inf)
Y
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
X = StandardScaler().fit_transform(X)
def forest_test(X, Y):
    # Split the data, fit a Random Forest and report timing, confusion matrix and metrics
    X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.30, random_state=101)
    start = time.process_time()
    trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Train, Y_Train)
    print(time.process_time() - start)
    predictionforest = trainedforest.predict(X_Test)
    print(confusion_matrix(Y_Test, predictionforest))
    print(classification_report(Y_Test, predictionforest))
def complete_test_2D(X, Y, plot_name=''):
    # Evaluate the 2D reduced features with a Random Forest and plot them by class
    Small_df = pd.DataFrame(data=X, columns=['C1', 'C2'])
    Small_df = pd.concat([Small_df, df['class']], axis=1)
    Small_df['class'] = LabelEncoder().fit_transform(Small_df['class'])
    forest_test(X, Y)
    data = []
    for clas, col, name in zip((1, 0), ['red', 'green'], ['Poisonous', 'Edible']):
        trace = dict(
            type='scatter',
            x=Small_df.loc[Small_df['class'] == clas, 'C1'],
            y=Small_df.loc[Small_df['class'] == clas, 'C2'],
            mode='markers',
            name=name,
            marker=dict(
                color=col,
                size=12,
                line=dict(
                    color='rgba(217, 217, 217, 0.14)',
                    width=0.5),
                opacity=0.8)
        )
        data.append(trace)
    layout = dict(
        title=plot_name + ' 2D Dimensionality Reduction',
        xaxis=dict(title='C1', showline=False),
        yaxis=dict(title='C2', showline=False)
    )
    fig = dict(data=data, layout=layout)
    iplot(fig)
def complete_test_3D(X, Y, plot_name=''):
    # Evaluate the 3D reduced features with a Random Forest and plot them by class
    Small_df = pd.DataFrame(data=X, columns=['C1', 'C2', 'C3'])
    Small_df = pd.concat([Small_df, df['class']], axis=1)
    Small_df['class'] = LabelEncoder().fit_transform(Small_df['class'])
    forest_test(X, Y)
    data = []
    for clas, col, name in zip((1, 0), ['red', 'green'], ['Poisonous', 'Edible']):
        trace = dict(
            type='scatter3d',
            x=Small_df.loc[Small_df['class'] == clas, 'C1'],
            y=Small_df.loc[Small_df['class'] == clas, 'C2'],
            z=Small_df.loc[Small_df['class'] == clas, 'C3'],
            mode='markers',
            name=name,
            marker=dict(color=col)
        )
        data.append(trace)
    layout = {
        "scene": {
            "xaxis": {"title": "C1", "showline": False},
            "yaxis": {"title": "C2", "showline": False},
            "zaxis": {"title": "C3", "showline": False}
        },
        "title": plot_name + ' 3D Dimensionality Reduction'
    }
    fig = dict(data=data, layout=layout)
    iplot(fig)
forest_test(X, Y)
Principal Component Analysis (PCA) is a statistical procedure that constructs an orthogonal transformation to convert a set of correlated observations into a lower-dimensional space spanned by uncorrelated variables (called principal components).
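As a quick aside, what `PCA.fit_transform` computes boils down to an SVD of the centered data matrix. Below is a minimal sketch on toy data (the array `A` and the other names are illustrative only, not part of this notebook's pipeline):
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(100, 5)                       # toy data: 100 samples, 5 features
A_c = A - A.mean(axis=0)                    # center each feature
U, S, Vt = np.linalg.svd(A_c, full_matrices=False)
A_2d = A_c @ Vt[:2].T                       # projection onto the first 2 principal components
print((S ** 2) / np.sum(S ** 2))            # explained variance ratio per component
Up to sign flips, `A_2d` matches `PCA(n_components=2).fit_transform(A)`, and the printed ratios match sklearn's `explained_variance_ratio_`.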
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
PCA_df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2'])
PCA_df = pd.concat([PCA_df, df['class']], axis = 1)
PCA_df['class'] = LabelEncoder().fit_transform(PCA_df['class'])
PCA_df.head()
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
classes = [1, 0]
colors = ['r', 'g']
for clas, color in zip(classes, colors):
    plt.scatter(PCA_df.loc[PCA_df['class'] == clas, 'PC1'],
                PCA_df.loc[PCA_df['class'] == clas, 'PC2'], c=color)
plt.xlabel('Principal Component 1', fontsize = 12)
plt.ylabel('Principal Component 2', fontsize = 12)
plt.title('2D PCA', fontsize = 15)
plt.legend(['Poisonous', 'Edible'])
plt.grid()
pca.explained_variance_ratio_
complete_test_2D(X_pca, Y, 'PCA')
var_ratio = pca.explained_variance_ratio_
cum_var_ratio = np.cumsum(var_ratio)
trace1 = dict(
    type='bar',
    x=['PC %s' % i for i in range(1, len(var_ratio) + 1)],
    y=var_ratio,
    name='Individual'
)
trace2 = dict(
    type='scatter',
    x=['PC %s' % i for i in range(1, len(cum_var_ratio) + 1)],
    y=cum_var_ratio,
    name='Cumulative'
)
data = [trace1, trace2]
layout = dict(
    title='Explained variance ratio by principal component',
    yaxis=dict(
        title='Explained variance ratio'
    )
)
fig = dict(data=data, layout=layout)
iplot(fig)
X_Reduced, X_Test_Reduced, Y_Reduced, Y_Test_Reduced = train_test_split(X_pca, Y, test_size = 0.30, random_state = 101)
trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Reduced,Y_Reduced)
x_min, x_max = X_Reduced[:, 0].min() - 1, X_Reduced[:, 0].max() + 1
y_min, y_max = X_Reduced[:, 1].min() - 1, X_Reduced[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = trainedforest.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z,cmap=plt.cm.coolwarm, alpha=0.4)
plt.scatter(X_Reduced[:, 0], X_Reduced[:, 1], c=Y_Reduced, s=20, edgecolor='k')
plt.xlabel('Principal Component 1', fontsize = 12)
plt.ylabel('Principal Component 2', fontsize = 12)
plt.title('Random Forest', fontsize = 15)
plt.show()
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
complete_test_3D(X_pca, Y, 'PCA')
var_ratio = pca.explained_variance_ratio_
cum_var_ratio = np.cumsum(var_ratio)
trace1 = dict(
    type='bar',
    x=['PC %s' % i for i in range(1, len(var_ratio) + 1)],
    y=var_ratio,
    name='Individual'
)
trace2 = dict(
    type='scatter',
    x=['PC %s' % i for i in range(1, len(cum_var_ratio) + 1)],
    y=cum_var_ratio,
    name='Cumulative'
)
data = [trace1, trace2]
layout = dict(
    title='Explained variance ratio by principal component',
    yaxis=dict(
        title='Explained variance ratio'
    )
)
fig = dict(data=data, layout=layout)
iplot(fig)
t-SNE is a non-linear dimensionality-reduction technique. It works by minimizing the KL divergence between the joint similarity distributions of the points in the original space and in the reduced space.
The original high-dimensional space is modeled with a normal distribution, while the reduced space is modeled with a Student's t-distribution. This avoids the imbalance in the distribution of a neighborhood's distances that would otherwise arise when translating it to a lower dimension.
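Written out, the cost function t-SNE minimizes is the KL divergence between the pairwise-similarity distributions $P$ (original space) and $Q$ (embedding):
$$C = \mathrm{KL}(P \,\|\, Q) = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}}, \qquad q_{ij} = \frac{\left(1 + \lVert y_i - y_j \rVert^2\right)^{-1}}{\sum_{k \neq l} \left(1 + \lVert y_k - y_l \rVert^2\right)^{-1}}$$
where the $p_{ij}$ come from Gaussian kernels over the original points and the $q_{ij}$ from the heavier-tailed Student-t kernel over the embedded points $y_i$.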
from sklearn.manifold import TSNE
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(X)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
sns.scatterplot(
x=X_tsne[:,0], y=X_tsne[:,1],
hue=Y,
palette=sns.color_palette("hls", 2),
data=df,
legend="full",
alpha=0.3
)
complete_test_2D(X_tsne, Y, 't-SNE')
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(X)
complete_test_3D(X_tsne, Y, 't-SNE')
ICA is a linear dimensionality-reduction method that interprets a signal as a combination of several sources; it tries to identify which part of the full signal corresponds to each source, attempting to remove the noise.
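To make the "combination of sources" idea concrete, here is a small illustration on synthetic signals (the names `S`, `A_mix` and `X_mixed` are ours, not part of this notebook's pipeline): two independent sources are mixed linearly and FastICA recovers them from the mixtures alone, up to order, sign and scale.
import numpy as np
from sklearn.decomposition import FastICA

t = np.linspace(0, 8, 2000)
s1 = np.sin(2 * t)                            # source 1: sinusoid
s2 = np.sign(np.sin(3 * t))                   # source 2: square wave
S = np.c_[s1, s2]                             # independent sources
A_mix = np.array([[1.0, 0.5], [0.5, 2.0]])    # mixing matrix (unknown in practice)
X_mixed = S @ A_mix.T                         # observed signals = mixed sources
S_est = FastICA(n_components=2, random_state=0).fit_transform(X_mixed)
print(S_est.shape)                            # (2000, 2): the recovered source estimates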
from sklearn.decomposition import FastICA
ica = FastICA(n_components=2)
X_ica = ica.fit_transform(X)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_ica.shape[1])
complete_test_2D(X_ica, Y, 'ICA')
ica = FastICA(n_components=3)
X_ica = ica.fit_transform(X)
complete_test_3D(X_ica, Y, 'ICA')
import umap
import kmapper as km
import sklearn
mapper = km.KeplerMapper(verbose=0)
X_tda = mapper.fit_transform(X, projection=TSNE(n_components=2, verbose=1, perplexity=8, n_iter=300))
graph = mapper.map(X_tda)
_ = mapper.visualize(graph, color_function=df["class"].map(lambda x: 0 if x=="e" else 1).values,
path_html="mushrooms_2d.html")
complete_test_2D(X_tda, Y, 'TDA')
X_tda = mapper.fit_transform(X, projection=TSNE(n_components=3, verbose=1, perplexity=12, n_iter=300))
complete_test_3D(X_tda, Y, 'TDA')
Autoencoders are a family of algorithms aimed at dimensionality-reduction problems. Their main difference from the other methods is that they provide a non-linear mechanism able to capture patterns in the data at whatever level of abstraction is needed.
There are several types of autoencoders; common variants include denoising, sparse and variational autoencoders.
In this example we will look at a basic autoencoder (no regularization or special architecture) that reduces the original data to 2 representative dimensions. An autoencoder has two components: an encoder, which compresses the input into a lower-dimensional representation, and a decoder, which reconstructs the input from that representation.
If the original features are independent, the autoencoder will not be able to compress their information content into a lower-dimensional space very well.
Here we use ReLU as the activation function for the encoding and Softmax for the decoding. This is worth noting: had we not used non-linear activation functions, the autoencoder would have learned a linear transformation (giving a result similar to PCA, for example).
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(2, activation='relu')(input_layer)
decoded = Dense(X.shape[1], activation='softmax')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
X1, X2, Y1, Y2 = train_test_split(X, X, test_size=0.3, random_state=101)
autoencoder.fit(X1, Y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose=0,
                validation_data=(X2, Y2))
encoder = Model(input_layer, encoded)
X_ae = encoder.predict(X)
complete_test_2D(X_ae, Y, 'AE')
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(3, activation='relu')(input_layer)
decoded = Dense(X.shape[1], activation='softmax')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
X1, X2, Y1, Y2 = train_test_split(X, X, test_size=0.3, random_state=101)
autoencoder.fit(X1, Y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose=0,
                validation_data=(X2, Y2))
encoder = Model(input_layer, encoded)
X_ae = encoder.predict(X)
complete_test_3D(X_ae, Y, 'AE')